library(cregg)
library(dplyr)
library(ggplot2)
library(stringr)
library(formula.tools)
library(stringr)
library(patchwork)
library(tikzDevice)
library(estimatr)
library(lmtest)

# Reproduces the results with separate estimates for each tasks numbers. 
# this is a test of the stability assumption from Hainmueller et al. 2014
# these results are found in the appendix

fix_duplicate_level_names <- function(cregg_function_input, formula_to_be_used){
  # the cregg package does not accept attributes with the same level name. 
  # this function temporarily adds a special character to the duplicate level names
  
  variables_to_be_used <- rhs.vars(formula_to_be_used)
  variables_to_be_used <- str_remove_all(variables_to_be_used, '`')
  
  variables_used_in_conjoint <- dplyr::select(survey_data, all_of(variables_to_be_used))
  
  observed_level_names <- c()
  
  for(variable in variables_to_be_used){
    
    attribute <- variables_used_in_conjoint[, variable]
    level_names <- levels(attribute)
    
    level_names_in_other_attribute_mask <- level_names %in% observed_level_names
    level_names_in_other_attribute <- level_names[level_names_in_other_attribute_mask]
    
    if (length(level_names_in_other_attribute) > 0) {
      
      corrected_name <- paste0(level_names_in_other_attribute, "@")
      names(corrected_name) <- level_names_in_other_attribute
      
      attribute <- recode_factor(attribute, !!!corrected_name)
      
    }
    
    previously_unobserved_lvl_names <- level_names[!level_names_in_other_attribute_mask]
    observed_level_names <- c(observed_level_names, previously_unobserved_lvl_names)
    
    survey_data[, variable] <- attribute  
    
  }
  
  return(survey_data)
}

fix_level_name_in_cregg_output <- function(cregg_output){
  # remove special character that was introduced to avoid duplicate level names 
  # when calculating marginal means 
  cregg_output <- cregg_output %>% 
    mutate(level = stringr::str_remove_all(string = as.character(level), pattern = "@"))
  cregg_output <- cregg_output %>% 
    mutate(level = as.factor(level))
  return(cregg_output)
}


estimate_nation_outcome_helper <- function(task_dataframe){
  
  # helper function to estimate the nation outcome
  # used in a lapply call 
  good_nation_output <- cregg::cj(
    formula = good_nation_formula,
    data = task_dataframe,
    id = ~ResponseId,
    estimate = "mm"
  )
  
  return(good_nation_output)
}

estimate_vote_choice_outcome_helper <- function(task_dataframe){
  
  # helper function to estimate the region outcome
  vote_choice_output <- cregg::cj(
    formula = vote_choice_formula,
    data = task_dataframe,
    id = ~ResponseId,
    estimate = "mm"
  )
  
  return(vote_choice_output)
}

vote_choice_plot_helper <- function(task_data, selected_task){
  # makes plot in a given task 
  # helper for lapply
  
  task_plot_data <- task_data %>%
    filter(feature %in% c("Father's Job", "University", "Occupation", "High School")) %>% 
    filter(task_number == selected_task) 
  
  
  task_plot <- task_plot_data%>% 
    ggplot(aes(y = level, x = estimate, xmin = lower, xmax = upper)) +
    geom_point() +
    geom_errorbar(width=0) + 
    geom_vline(xintercept = 0.5, linetype = "dashed") +
    facet_wrap(.~feature, scales = "free_y", strip.position = "top", ncol = 1) + 
    theme(axis.title.y = element_blank(),
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.title.x = element_text(size = 8),
          axis.text.x = element_text(size = 6),
          legend.position = "none") +
    scale_x_continuous(name = paste0("Estimate ", "task ", selected_task),
                       limits = c(0.35, 0.65),
                       breaks = seq(0.05, 1.05, by = 0.1))
  
  return(task_plot)
}

estimate_region_outcome_helper <- function(task_dataframe){
  good_region_output <- cregg::cj(
    formula = good_region_formula,
    data = task_dataframe,
    id = ~ResponseId,
    estimate = "mm"
  )
  
  return(good_region_output)
}

region_outcome_plot_helper <- function(cregg_output_region, selected_task){
  # makes plot in a given task 
  # helper for lapply
  
  region_plot <- cregg_output_region %>% 
    filter(feature %in% c("Father's Job", "University", "Occupation", "High School")) %>% 
    filter(task_number == selected_task) %>% 
    ggplot(aes(x = estimate, y = level)) + 
    geom_point() + 
    geom_errorbar(aes(xmin = lower, xmax = upper), width = 0) + 
    facet_wrap(~feature, dir = "v", strip.position = "top", scales = "free_y", ncol = 1) +
    scale_x_continuous(name = paste0("Estimate Task ", selected_task), 
                       limits = c(0.35, 0.65),
                       breaks = seq(0.05, 1.05, by = 0.1)) + 
    geom_vline(xintercept = 0.5, linetype = "dashed") +
    theme(axis.title.y = element_blank(),
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.title.x = element_text(size = 8),
          axis.text.x = element_text(size = 6),
          legend.position = "none") +
    theme(legend.position = "none")
  
  return(region_plot)
}


estimate_mm_per_tasks <- function(task_data, helper_function){
  # estimate marginal means in a given task
  
  mm_per_task <- lapply(task_data, helper_function)
  mm_per_task <- lapply(1:8, function(x) mutate(mm_per_task[[x]], task_number = x))
  mm_per_task <- data.table::rbindlist(mm_per_task)
  mm_per_task <- fix_level_name_in_cregg_output(mm_per_task)
  
  return(mm_per_task)
}

nation_outcome_plot_helper <- function(cregg_output_nation, selected_task){
  # makes plot in a given task 
  # helper for lapply
  
  nation_plot <- cregg_output_nation %>% 
    filter(feature %in% c("Father's Job", "University", "Occupation", "High School")) %>% 
    filter(task_number == selected_task) %>% 
    ggplot(aes(x = estimate, y = level)) + 
    geom_point() + 
    geom_errorbar(aes(xmin = lower, xmax = upper), width = 0) + 
    facet_wrap(~feature, dir = "v", strip.position = "top", scales = "free_y", ncol = 1) +
    scale_x_continuous(name = paste0("Estimate Task ", selected_task), 
                       limits = c(0.35, 0.65),
                       breaks = seq(0, 1, by = 0.1)) + 
    geom_vline(xintercept = 0.5, linetype = "dashed") +
    theme(axis.title.y = element_blank(),
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.title.x = element_text(size = 8),
          axis.text.x = element_text(size = 6),
          legend.position = "none")
  
  return(nation_plot)
}


path_to_data <- "data/first_survey_formatted.rds"

path_to_figures <- "figures"
paper_figures_folder <- "paper_plots"
appendix_figures_folder <- "appendix_plots"

vote_choice_nocarryover <- "vote_choice_carryover.eps"
region_nocarryover <- "region_carryover.eps"
nation_nocarryover <- "nation_carryover.eps"

vote_choice_nocarryover_first_task <- "vote_choice_carryover_first_task.eps"
region_nocarryover_first_task <- "region_carryover_first_task.eps"
nation_nocarryover_first_task <- "nation_carryover_first_task.eps"

survey_data <- readRDS(path_to_data)

good_region_formula <- 
  good_region ~ `Father's Job` + `Incumbent Status` + Gender + `Years in Employment` + University + Occupation + `High School` + `Policy Position`
good_nation_formula <- 
  good_country ~ `Father's Job` + `Incumbent Status` + Gender + `Years in Employment` + University + Occupation + `High School` + `Policy Position`
vote_choice_formula <- 
  vote_choice ~ `Father's Job` + `Incumbent Status` + Gender + `Years in Employment` + University + Occupation + `High School` + `Policy Position`


survey_data <- fix_duplicate_level_names(survey_data, good_region_formula)

# split data by task number
task_num_split <- survey_data %>% 
  dplyr::group_split(task_number)

vote_choice_mms_by_task <- estimate_mm_per_tasks(task_num_split, estimate_vote_choice_outcome_helper)
vote_task_plots <- lapply(2:8, function(x) vote_choice_plot_helper(vote_choice_mms_by_task, x))

vote_first_task_plot <- vote_choice_mms_by_task %>% 
  filter(feature %in% c("Father's Job", "University", "Occupation", "High School")) %>% 
  filter(task_number == 1) %>% 
  ggplot(aes(y = level, x = estimate, xmin = lower, xmax = upper)) +
  geom_point() +
  geom_errorbar(width=0) + 
  geom_vline(xintercept = 0.5, linetype = "dashed") +
  facet_wrap(.~feature, scales = "free_y", strip.position = "top", ncol = 1) + 
  theme(
        legend.position = "none", 
        axis.title.x = element_text(size = 8),
        axis.text.x = element_text(size = 6)) + 
  scale_x_continuous(name = paste0("Estimate ", "task ", "1"),
                     limits = c(0.35, 0.65), 
                     breaks = seq(0.05, 1.05, by = 0.1))

no_carryover_assumption_vote_choice_plot <- 
  vote_first_task_plot + 
  vote_task_plots[[1]] + 
  vote_task_plots[[2]] + 
  vote_task_plots[[3]] + 
  vote_task_plots[[4]] +
  vote_task_plots[[5]] +
  vote_task_plots[[6]] + 
  vote_task_plots[[7]] +
  plot_layout(ncol = 8)

region_mm_per_task <- estimate_mm_per_tasks(task_num_split, estimate_region_outcome_helper)
region_task_plots <- lapply(2:8, function(x) region_outcome_plot_helper(region_mm_per_task, x))

region_first_task_plot <- region_mm_per_task %>% 
  filter(feature %in% c("Father's Job", "University", "Occupation", "High School")) %>% 
  filter(task_number == 1) %>% 
  ggplot(aes(x = estimate, y = level)) + 
  geom_point() + 
  geom_errorbar(aes(xmin = lower, xmax = upper), width = 0) + 
  facet_wrap(~feature, dir = "v", strip.position = "top", scales = "free_y", ncol = 1) +
  scale_x_continuous(name = "Estimate Task 1", 
                     limits = c(0.35, 0.65),
                     breaks = seq(0.05, 1.05, by = 0.1)) + 
  geom_vline(xintercept = 0.5, linetype = "dashed") +
  theme(legend.position = "none", 
        axis.title.x = element_text(size = 8),
        axis.text.x = element_text(size = 6)) 

no_carryover_assumption_region_plot <- 
  region_first_task_plot + 
  region_task_plots[[1]] + 
  region_task_plots[[2]] + 
  region_task_plots[[3]] + 
  region_task_plots[[4]] +
  region_task_plots[[5]] +
  region_task_plots[[6]] + 
  region_task_plots[[7]] +
  plot_layout(ncol = 8)

nation_mm_per_task <- estimate_mm_per_tasks(task_num_split, estimate_nation_outcome_helper)
nation_task_plots <- lapply(2:8, function(x) nation_outcome_plot_helper(nation_mm_per_task, x))

nation_first_task_plot <- nation_mm_per_task %>% 
  filter(feature %in% c("Father's Job", "University", "Occupation", "High School")) %>% 
  filter(task_number == 1) %>% 
  ggplot(aes(x = estimate, y = level)) + 
  geom_point() + 
  geom_errorbar(aes(xmin = lower, xmax = upper), width = 0) + 
  facet_wrap(~feature, dir = "v", strip.position = "top", scales = "free_y", ncol = 1) +
  scale_x_continuous(name = "Estimate Task 1", 
                     limits = c(0.40, 0.60),
                     breaks = seq(0.05, 1.05, by = 0.05)) + 
  geom_vline(xintercept = 0.5, linetype = "dashed") +
  theme(legend.position = "none",
        axis.title.x = element_text(size = 8),
        axis.text.x = element_text(size = 6)) 

no_carryover_assumption_nation_plot <- 
  nation_first_task_plot + 
  nation_task_plots[[1]] + 
  nation_task_plots[[2]] + 
  nation_task_plots[[3]] + 
  nation_task_plots[[4]] +
  nation_task_plots[[5]] +
  nation_task_plots[[6]] + 
  nation_task_plots[[7]] +
  plot_layout(ncol = 8)


ggsave(
  filename = file.path(path_to_figures, appendix_figures_folder, vote_choice_nocarryover),
  plot = no_carryover_assumption_vote_choice_plot,
  height = 7, 
  width =11,
  units = "in",
  device = cairo_ps,
  fallback_resolution = 200)

ggsave(
  filename = file.path(path_to_figures, appendix_figures_folder, region_nocarryover),
  plot = no_carryover_assumption_region_plot,
  height = 7, 
  width =11,
  units = "in",
  device = cairo_ps,
  fallback_resolution = 200)

ggsave(
  filename = file.path(path_to_figures, appendix_figures_folder, nation_nocarryover),
  plot = no_carryover_assumption_nation_plot,
  height = 7, 
  width =11,
  units = "in",
  device = cairo_ps,
  fallback_resolution = 200)

ggsave(
  filename = file.path(path_to_figures, appendix_figures_folder, vote_choice_nocarryover_first_task),
  plot = vote_first_task_plot,
  height = 8, 
  width =6,
  units = "in",
  device = cairo_ps,
  fallback_resolution = 200)

ggsave(
  filename = file.path(path_to_figures, appendix_figures_folder, region_nocarryover_first_task),
  plot = region_first_task_plot,
  height = 8, 
  width =6,
  units = "in",
  device = cairo_ps,
  fallback_resolution = 200)

ggsave(
  filename = file.path(path_to_figures, appendix_figures_folder, nation_nocarryover_first_task),
  plot = nation_first_task_plot,
  height = 8, 
  width =6,
  units = "in",
  device = cairo_ps,
  fallback_resolution = 200)


## F-Tests 

survey_data$ResponseId <- as.factor(survey_data$ResponseId)
survey_data$task_number <- as.factor(survey_data$task_number)


task_number_uni_model <- lm_robust(vote_choice ~ University*task_number, clusters = ResponseId, data = survey_data) 
uni_model <- lm_robust(vote_choice ~ `University` + task_number, clusters = ResponseId, data = survey_data) 

task_number_ocu_model <- lm_robust(vote_choice ~ Occupation*task_number, clusters = ResponseId, data = survey_data) 
ocu_model <- lm_robust(vote_choice ~ `Occupation` + task_number, clusters = ResponseId, data = survey_data) 

task_number_hs_model <- lm_robust(vote_choice ~ `High School`*task_number, clusters = ResponseId, data = survey_data) 
hs_model <- lm_robust(vote_choice ~ `High School` + task_number, clusters = ResponseId, data = survey_data) 

task_number_fj_model <- lm_robust(vote_choice ~ `Father's Job`*task_number, data = survey_data) 
fj_model <- lm_robust(vote_choice ~ `Father's Job` + task_number, clusters = ResponseId, data = survey_data) 


task_number_uni_model_region <- lm_robust(good_region ~ University*task_number, clusters = ResponseId, data = survey_data) 
uni_model_region <- lm_robust(good_region ~ `University` + task_number, clusters = ResponseId, data = survey_data) 

task_number_ocu_model_region <- lm_robust(good_region ~ Occupation*task_number, clusters = ResponseId, data = survey_data) 
ocu_model_region <- lm_robust(good_region ~ `Occupation` + task_number, clusters = ResponseId, data = survey_data) 

task_number_hs_model_region <- lm_robust(good_region ~ `High School`*task_number, clusters = ResponseId, data = survey_data) 
hs_model_region <- lm_robust(good_region ~ `High School` + task_number, clusters = ResponseId, data = survey_data) 

task_number_fj_model_region <- lm_robust(good_region ~ `Father's Job`*task_number, clusters = ResponseId, data = survey_data) 
fj_model_region <- lm_robust(good_region ~ `Father's Job` + task_number, clusters = ResponseId, data = survey_data) 


task_number_uni_model_country <- lm_robust(good_country ~ University*task_number, clusters = ResponseId, data = survey_data) 
uni_model_country <- lm_robust(good_country ~ `University` + task_number, clusters = ResponseId, data = survey_data) 

task_number_ocu_model_country <- lm_robust(good_country ~ Occupation*task_number, clusters = ResponseId, data = survey_data) 
ocu_model_country <- lm_robust(good_country ~ `Occupation` + task_number, clusters = ResponseId, data = survey_data) 

task_number_hs_model_country <- lm_robust(good_country ~ `High School`*task_number, clusters = ResponseId, data = survey_data) 
hs_model_country <- lm_robust(good_country ~ `High School` + task_number, clusters = ResponseId, data = survey_data) 

task_number_fj_model_country <- lm_robust(good_country ~ `Father's Job`*task_number, clusters = ResponseId, data = survey_data) 
fj_model_country <- lm_robust(good_country ~ `Father's Job` + task_number, clusters = ResponseId, data = survey_data) 


ftest_vc_uni <- waldtest(task_number_uni_model, uni_model, test = "F")
ftest_vc_ocu <- waldtest(task_number_ocu_model, ocu_model, test = "F")
ftest_vc_hs <- waldtest(task_number_hs_model, hs_model, test = "F")
ftest_vc_fj <- waldtest(task_number_fj_model, fj_model, test = "F")

ftest_gr_uni <- waldtest(task_number_uni_model_region, uni_model_region, test = "F")
ftest_gr_ocu <- waldtest(task_number_ocu_model_region, ocu_model_region, test = "F")
ftest_gr_hs <- waldtest(task_number_hs_model_region, hs_model_region, test = "F")
ftest_gr_fj <- waldtest(task_number_fj_model_region, fj_model_region, test = "F")

ftest_gc_uni <- waldtest(task_number_uni_model_country, uni_model_country, test = "F")
ftest_gc_ocu <- waldtest(task_number_ocu_model_country, ocu_model_country, test = "F")
ftest_gc_hs <- waldtest(task_number_hs_model_country, hs_model_country, test = "F")
ftest_gc_fj <- waldtest(task_number_fj_model_country, fj_model_country, test = "F")

ftests <- list(
  ftest_vc_uni,
  ftest_vc_ocu,
  ftest_vc_hs,
  ftest_vc_fj,
  ftest_gr_uni,
  ftest_gr_ocu,
  ftest_gr_hs,
  ftest_gr_fj,
  ftest_gc_uni,
  ftest_gc_ocu,
  ftest_gc_hs,
  ftest_gc_fj
  
)

fstats <- sapply(ftests, function(x) x$F[2])
pvals <- sapply(ftests, function(x) x$`Pr(>F)`[2])

fstats <- round(fstats, digits = 2)
pvals <- round(pvals, digits = 2)

fstats_table_content <- paste0(fstats, " (p=" , pvals, ")")

outcome <- c("Vote choice", "Regional Welfare", "National Welfare")
uni <- fstats_table_content[c(1, 5, 9)]
ocu <- fstats_table_content[c(2, 6, 10)]
hs  <- fstats_table_content[c(3, 7, 11)]
fj <- fstats_table_content[c(4, 8, 12)]

fstat_table <- as.data.frame(cbind(outcome, uni, ocu, hs, fj))
colnames(fstat_table) <- c("Outcome Variable", "University F Stat", "Occupation F Stat", "High School F Stat", "Father's Job F Stat")

print(kableExtra::kable(fstat_table, format = "latex"))











